import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# 1A. Reading the Churn_1 csv file
churn1_df = pd.read_csv("TelcomCustomer-Churn_1.csv")
churn1_df.head()
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No |
#1B. Reading the Churn_2 csv file
churn2_df = pd.read_csv("TelcomCustomer-Churn_2.csv")
churn2_df.head()
| customerID | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
print("Churn 1",churn1_df.count())
print("")
print("Churn 2",churn2_df.count())
Churn 1 customerID 7043 gender 7043 SeniorCitizen 7043 Partner 7043 Dependents 7043 tenure 7043 PhoneService 7043 MultipleLines 7043 InternetService 7043 OnlineSecurity 7043 dtype: int64 Churn 2 customerID 7043 OnlineBackup 7043 DeviceProtection 7043 TechSupport 7043 StreamingTV 7043 StreamingMovies 7043 Contract 7043 PaperlessBilling 7043 PaymentMethod 7043 MonthlyCharges 7043 TotalCharges 7043 Churn 7043 dtype: int64
#1C.... Both the dataframes have equal number of rows 7043 and they have to be merged horizontally on the Customer ID
comb_df1 = pd.merge(churn1_df, churn2_df, on='customerID', indicator=True)
comb_df1 #............... _merge column indicates both the dataframes are merged
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | _merge | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No | both |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No | both |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes | both |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No | both |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes | both |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 6840-RESVB | Male | 0 | Yes | Yes | 24 | Yes | Yes | DSL | Yes | ... | Yes | Yes | Yes | One year | Yes | Mailed check | 84.80 | 1990.5 | No | both |
| 7039 | 2234-XADUH | Female | 0 | Yes | Yes | 72 | Yes | Yes | Fiber optic | No | ... | No | Yes | Yes | One year | Yes | Credit card (automatic) | 103.20 | 7362.9 | No | both |
| 7040 | 4801-JZAZL | Female | 0 | Yes | Yes | 11 | No | No phone service | DSL | Yes | ... | No | No | No | Month-to-month | Yes | Electronic check | 29.60 | 346.45 | No | both |
| 7041 | 8361-LTMKD | Male | 1 | Yes | No | 4 | Yes | Yes | Fiber optic | No | ... | No | No | No | Month-to-month | Yes | Mailed check | 74.40 | 306.6 | Yes | both |
| 7042 | 3186-AJIEK | Male | 0 | No | No | 66 | Yes | No | Fiber optic | Yes | ... | Yes | Yes | Yes | Two year | Yes | Bank transfer (automatic) | 105.65 | 6844.5 | No | both |
7043 rows × 22 columns
comb_df = pd.merge(churn1_df, churn2_df, on='customerID')
#1D. All the columns have been added to the new dataframes
print("churn1 shape",churn1_df.shape)
print("churn2 shape",churn2_df.shape)
print("combined df",comb_df.shape)
churn1 shape (7043, 10) churn2 shape (7043, 12) combined df (7043, 21)
churn1_df.eq(churn2_df).all(axis=1)
0 False
1 False
2 False
3 False
4 False
...
7038 False
7039 False
7040 False
7041 False
7042 False
Length: 7043, dtype: bool
churn1_df.columns
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
'OnlineSecurity'],
dtype='object')
churn2_df.columns
Index(['customerID', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
dtype='object')
comb_df.columns
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
dtype='object')
comb_df.dtypes
customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges object Churn object dtype: object
comb_df.describe()
| SeniorCitizen | tenure | MonthlyCharges | |
|---|---|---|---|
| count | 7043.000000 | 7043.000000 | 7043.000000 |
| mean | 0.162147 | 32.371149 | 64.761692 |
| std | 0.368612 | 24.559481 | 30.090047 |
| min | 0.000000 | 0.000000 | 18.250000 |
| 25% | 0.000000 | 9.000000 | 35.500000 |
| 50% | 0.000000 | 29.000000 | 70.350000 |
| 75% | 0.000000 | 55.000000 | 89.850000 |
| max | 1.000000 | 72.000000 | 118.750000 |
comb_df.head()
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
| 1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | ... | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
| 2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | ... | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
| 3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | ... | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
| 4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | ... | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
5 rows × 21 columns
#2A.... Impute missing/unexpected values....
comb_df.isnull().sum()
customerID 0 gender 0 SeniorCitizen 0 Partner 0 Dependents 0 tenure 0 PhoneService 0 MultipleLines 0 InternetService 0 OnlineSecurity 0 OnlineBackup 0 DeviceProtection 0 TechSupport 0 StreamingTV 0 StreamingMovies 0 Contract 0 PaperlessBilling 0 PaymentMethod 0 MonthlyCharges 0 TotalCharges 0 Churn 0 dtype: int64
comb_df.TotalCharges = pd.to_numeric(comb_df.TotalCharges, errors="coerce")
comb_df.dtypes
customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges float64 Churn object dtype: object
comb_df.describe()
| SeniorCitizen | tenure | MonthlyCharges | TotalCharges | |
|---|---|---|---|---|
| count | 7043.000000 | 7043.000000 | 7043.000000 | 7032.000000 |
| mean | 0.162147 | 32.371149 | 64.761692 | 2283.300441 |
| std | 0.368612 | 24.559481 | 30.090047 | 2266.771362 |
| min | 0.000000 | 0.000000 | 18.250000 | 18.800000 |
| 25% | 0.000000 | 9.000000 | 35.500000 | 401.450000 |
| 50% | 0.000000 | 29.000000 | 70.350000 | 1397.475000 |
| 75% | 0.000000 | 55.000000 | 89.850000 | 3794.737500 |
| max | 1.000000 | 72.000000 | 118.750000 | 8684.800000 |
comb_df['TotalCharges'].isnull().sum() #......... there are 11 missing values in TotalCharges
11
comb_df.isnull().values.any()
True
comb_df.isnull().sum().sum()
11
df1 = comb_df[comb_df.isna().any(axis=1)]
df1 #........... these are the rows with missing TotalCharges values
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 488 | 4472-LVYGI | Female | 0 | Yes | Yes | 0 | No | No phone service | DSL | Yes | ... | Yes | Yes | Yes | No | Two year | Yes | Bank transfer (automatic) | 52.55 | NaN | No |
| 753 | 3115-CZMZD | Male | 0 | No | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.25 | NaN | No |
| 936 | 5709-LVOEQ | Female | 0 | Yes | Yes | 0 | Yes | No | DSL | Yes | ... | Yes | No | Yes | Yes | Two year | No | Mailed check | 80.85 | NaN | No |
| 1082 | 4367-NUYAO | Male | 0 | Yes | Yes | 0 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.75 | NaN | No |
| 1340 | 1371-DWPAZ | Female | 0 | Yes | Yes | 0 | No | No phone service | DSL | Yes | ... | Yes | Yes | Yes | No | Two year | No | Credit card (automatic) | 56.05 | NaN | No |
| 3331 | 7644-OMVMY | Male | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 19.85 | NaN | No |
| 3826 | 3213-VVOLG | Male | 0 | Yes | Yes | 0 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.35 | NaN | No |
| 4380 | 2520-SGTTA | Female | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.00 | NaN | No |
| 5218 | 2923-ARZLG | Male | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | One year | Yes | Mailed check | 19.70 | NaN | No |
| 6670 | 4075-WKNIU | Female | 0 | Yes | Yes | 0 | Yes | Yes | DSL | No | ... | Yes | Yes | Yes | No | Two year | No | Mailed check | 73.35 | NaN | No |
| 6754 | 2775-SEFEE | Male | 0 | No | Yes | 0 | Yes | Yes | DSL | Yes | ... | No | Yes | No | No | Two year | Yes | Bank transfer (automatic) | 61.90 | NaN | No |
11 rows × 21 columns
comb_df.sort_values(by="TotalCharges")
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1654 | 2967-MXRAV | Male | 0 | Yes | Yes | 1 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | One year | No | Mailed check | 18.80 | 18.80 | No |
| 6489 | 9318-NKNFC | Male | 0 | No | No | 1 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | Yes | Mailed check | 18.85 | 18.85 | Yes |
| 1151 | 8992-CEUEN | Female | 0 | No | No | 1 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | No | Electronic check | 18.85 | 18.85 | No |
| 4939 | 9975-SKRNR | Male | 0 | No | No | 1 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | No | Mailed check | 18.90 | 18.90 | No |
| 583 | 1423-BMPBQ | Female | 0 | Yes | Yes | 1 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Month-to-month | Yes | Mailed check | 19.00 | 19.00 | No |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3826 | 3213-VVOLG | Male | 0 | Yes | Yes | 0 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.35 | NaN | No |
| 4380 | 2520-SGTTA | Female | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.00 | NaN | No |
| 5218 | 2923-ARZLG | Male | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | One year | Yes | Mailed check | 19.70 | NaN | No |
| 6670 | 4075-WKNIU | Female | 0 | Yes | Yes | 0 | Yes | Yes | DSL | No | ... | Yes | Yes | Yes | No | Two year | No | Mailed check | 73.35 | NaN | No |
| 6754 | 2775-SEFEE | Male | 0 | No | Yes | 0 | Yes | Yes | DSL | Yes | ... | No | Yes | No | No | Two year | Yes | Bank transfer (automatic) | 61.90 | NaN | No |
7043 rows × 21 columns
# Replace using median
median = comb_df['TotalCharges'].median()
comb_df['TotalCharges'].fillna(median, inplace=True)
comb_df['TotalCharges'].isnull().sum() #........... after replacing with median........ there are no more missing values
0
comb_df.describe()
| SeniorCitizen | tenure | MonthlyCharges | TotalCharges | |
|---|---|---|---|---|
| count | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 |
| mean | 0.162147 | 32.371149 | 64.761692 | 2281.916928 |
| std | 0.368612 | 24.559481 | 30.090047 | 2265.270398 |
| min | 0.000000 | 0.000000 | 18.250000 | 18.800000 |
| 25% | 0.000000 | 9.000000 | 35.500000 | 402.225000 |
| 50% | 0.000000 | 29.000000 | 70.350000 | 1397.475000 |
| 75% | 0.000000 | 55.000000 | 89.850000 | 3786.600000 |
| max | 1.000000 | 72.000000 | 118.750000 | 8684.800000 |
#2A........... Other than TotalCharges column all other columns had no missing values or null values....
# But those Null values in Total Charges were not being picked up by pandas initially and was able to find it after
# the column was changed to_numeric
#... There were 11 Null values in TotalCharges column and they have been replaced by Median values.
for i in comb_df.columns:
if ((i in ["customerID","tenure","MonthlyCharges","TotalCharges"])):
print()
else:
print("% S unique values", i)
print(comb_df[i].value_counts())
print("")
% S unique values gender Male 3555 Female 3488 Name: gender, dtype: int64 % S unique values SeniorCitizen 0 5901 1 1142 Name: SeniorCitizen, dtype: int64 % S unique values Partner No 3641 Yes 3402 Name: Partner, dtype: int64 % S unique values Dependents No 4933 Yes 2110 Name: Dependents, dtype: int64 % S unique values PhoneService Yes 6361 No 682 Name: PhoneService, dtype: int64 % S unique values MultipleLines No 3390 Yes 2971 No phone service 682 Name: MultipleLines, dtype: int64 % S unique values InternetService Fiber optic 3096 DSL 2421 No 1526 Name: InternetService, dtype: int64 % S unique values OnlineSecurity No 3498 Yes 2019 No internet service 1526 Name: OnlineSecurity, dtype: int64 % S unique values OnlineBackup No 3088 Yes 2429 No internet service 1526 Name: OnlineBackup, dtype: int64 % S unique values DeviceProtection No 3095 Yes 2422 No internet service 1526 Name: DeviceProtection, dtype: int64 % S unique values TechSupport No 3473 Yes 2044 No internet service 1526 Name: TechSupport, dtype: int64 % S unique values StreamingTV No 2810 Yes 2707 No internet service 1526 Name: StreamingTV, dtype: int64 % S unique values StreamingMovies No 2785 Yes 2732 No internet service 1526 Name: StreamingMovies, dtype: int64 % S unique values Contract Month-to-month 3875 Two year 1695 One year 1473 Name: Contract, dtype: int64 % S unique values PaperlessBilling Yes 4171 No 2872 Name: PaperlessBilling, dtype: int64 % S unique values PaymentMethod Electronic check 2365 Mailed check 1612 Bank transfer (automatic) 1544 Credit card (automatic) 1522 Name: PaymentMethod, dtype: int64 % S unique values Churn No 5174 Yes 1869 Name: Churn, dtype: int64
for i in comb_df.columns:
if ((i in ["customerID","tenure","MonthlyCharges","TotalCharges"])):
print()
else:
fig = plt.figure(figsize = (10,5))
plt.rcParams.update({'figure.max_open_warning': 0})
ax = fig.gca()
fig.suptitle(i, fontsize=20)
#plt.figure(i)
sns.histplot(comb_df[i], kde=True, ax=ax)
#print("")
comb_df["TotalCharges"].unique()
array([ 29.85, 1889.5 , 108.15, ..., 346.45, 306.6 , 6844.5 ])
#2B..... Converting Continuous variable column TotalCharges to float
comb_df['TotalCharges'] = pd.to_numeric(comb_df['TotalCharges'],errors='coerce')
comb_df['TotalCharges'] = comb_df['TotalCharges'].astype(float)
comb_df.dtypes
customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges float64 Churn object dtype: object
# 2C........... Function to accept dataframe and then print pie charts for all Categorical columns
def pie_df(df):
for i in df.columns:
if (df[i].dtypes == "object"):
if i in ["customerID","TotalCharges"]:
print()
else:
#print("column is %S",i)
df.groupby(i).size().plot(kind='pie', subplots=True, shadow=True, startangle=30, figsize=(8,6), autopct='%1.2f%%')
font1 = {'family':'serif','color':'blue','size':20}
plt.title(i, fontdict = font1)
plt.tight_layout()
plt.show()
pie_df(comb_df)
There are totally 19 categorical value columns
21% in any case have no Internetservice
# 2E... Encoding all the Categorical variables
#classifying the categorical variables into 2 variants (Label encoding & One Hot encoding)
replaceStruct = {
"MultipleLines": {"Yes": 1, "No": 0, "No phone service": -1},
"OnlineSecurity": {"Yes": 1, "No": 0 ,"No internet service": -1},
"OnlineBackup": {"Yes": 1, "No": 0 ,"No internet service": -1},
"DeviceProtection": {"Yes": 1, "No": 0 ,"No internet service": -1},
"TechSupport": {"Yes": 1, "No": 0 ,"No internet service": -1},
"StreamingTV": {"Yes": 1, "No": 0 ,"No internet service": -1},
"StreamingMovies": {"Yes": 1, "No": 0 ,"No internet service": -1},
"Contract": {"Month-to-month": 1, "One year": 2 ,"Two year": 3},
"Churn": {"Yes": 1, "No": 0 },
"Partner": {"Yes": 1, "No": 0 },
"Dependents": {"Yes": 1, "No": 0 },
"PhoneService": {"Yes": 1, "No": 0 },
"PaperlessBilling": {"Yes": 1, "No": 0 }
}
oneHotCols=["PaymentMethod","InternetService","gender"]
replaceStruct
{'MultipleLines': {'Yes': 1, 'No': 0, 'No phone service': -1},
'OnlineSecurity': {'Yes': 1, 'No': 0, 'No internet service': -1},
'OnlineBackup': {'Yes': 1, 'No': 0, 'No internet service': -1},
'DeviceProtection': {'Yes': 1, 'No': 0, 'No internet service': -1},
'TechSupport': {'Yes': 1, 'No': 0, 'No internet service': -1},
'StreamingTV': {'Yes': 1, 'No': 0, 'No internet service': -1},
'StreamingMovies': {'Yes': 1, 'No': 0, 'No internet service': -1},
'Contract': {'Month-to-month': 1, 'One year': 2, 'Two year': 3},
'Churn': {'Yes': 1, 'No': 0},
'Partner': {'Yes': 1, 'No': 0},
'Dependents': {'Yes': 1, 'No': 0},
'PhoneService': {'Yes': 1, 'No': 0},
'PaperlessBilling': {'Yes': 1, 'No': 0}}
oneHotCols
['PaymentMethod', 'InternetService', 'gender']
new_df=comb_df.replace(replaceStruct)
new_df=pd.get_dummies(new_df, columns=oneHotCols)
new_df.head(10)
| customerID | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | OnlineSecurity | OnlineBackup | DeviceProtection | ... | Churn | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | InternetService_DSL | InternetService_Fiber optic | InternetService_No | gender_Female | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | 0 | 1 | 0 | 1 | 0 | -1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| 1 | 5575-GNVDE | 0 | 0 | 0 | 34 | 1 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 2 | 3668-QPYBK | 0 | 0 | 0 | 2 | 1 | 0 | 1 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 3 | 7795-CFOCW | 0 | 0 | 0 | 45 | 0 | -1 | 1 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 4 | 9237-HQITU | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 5 | 9305-CDSKC | 0 | 0 | 0 | 8 | 1 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 6 | 1452-KIOVK | 0 | 0 | 1 | 22 | 1 | 1 | 0 | 1 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 7 | 6713-OKOMC | 0 | 0 | 0 | 10 | 0 | -1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 |
| 8 | 7892-POOKP | 0 | 1 | 0 | 28 | 1 | 1 | 0 | 0 | 1 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| 9 | 6388-TABGU | 0 | 0 | 1 | 62 | 1 | 0 | 1 | 1 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
10 rows × 27 columns
for i in new_df.columns:
if ((i in ["customerID","tenure","MonthlyCharges","TotalCharges"])):
print()
else:
print("% S unique values", i)
print(new_df[i].value_counts())
print("")
% S unique values SeniorCitizen 0 5901 1 1142 Name: SeniorCitizen, dtype: int64 % S unique values Partner 0 3641 1 3402 Name: Partner, dtype: int64 % S unique values Dependents 0 4933 1 2110 Name: Dependents, dtype: int64 % S unique values PhoneService 1 6361 0 682 Name: PhoneService, dtype: int64 % S unique values MultipleLines 0 3390 1 2971 -1 682 Name: MultipleLines, dtype: int64 % S unique values OnlineSecurity 0 3498 1 2019 -1 1526 Name: OnlineSecurity, dtype: int64 % S unique values OnlineBackup 0 3088 1 2429 -1 1526 Name: OnlineBackup, dtype: int64 % S unique values DeviceProtection 0 3095 1 2422 -1 1526 Name: DeviceProtection, dtype: int64 % S unique values TechSupport 0 3473 1 2044 -1 1526 Name: TechSupport, dtype: int64 % S unique values StreamingTV 0 2810 1 2707 -1 1526 Name: StreamingTV, dtype: int64 % S unique values StreamingMovies 0 2785 1 2732 -1 1526 Name: StreamingMovies, dtype: int64 % S unique values Contract 1 3875 3 1695 2 1473 Name: Contract, dtype: int64 % S unique values PaperlessBilling 1 4171 0 2872 Name: PaperlessBilling, dtype: int64 % S unique values Churn 0 5174 1 1869 Name: Churn, dtype: int64 % S unique values PaymentMethod_Bank transfer (automatic) 0 5499 1 1544 Name: PaymentMethod_Bank transfer (automatic), dtype: int64 % S unique values PaymentMethod_Credit card (automatic) 0 5521 1 1522 Name: PaymentMethod_Credit card (automatic), dtype: int64 % S unique values PaymentMethod_Electronic check 0 4678 1 2365 Name: PaymentMethod_Electronic check, dtype: int64 % S unique values PaymentMethod_Mailed check 0 5431 1 1612 Name: PaymentMethod_Mailed check, dtype: int64 % S unique values InternetService_DSL 0 4622 1 2421 Name: InternetService_DSL, dtype: int64 % S unique values InternetService_Fiber optic 0 3947 1 3096 Name: InternetService_Fiber optic, dtype: int64 % S unique values InternetService_No 0 5517 1 1526 Name: InternetService_No, dtype: int64 % S unique values gender_Female 0 3555 1 3488 Name: gender_Female, dtype: int64 % S unique values gender_Male 1 3555 0 3488 Name: gender_Male, dtype: int64
new_df
| customerID | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | OnlineSecurity | OnlineBackup | DeviceProtection | ... | Churn | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | InternetService_DSL | InternetService_Fiber optic | InternetService_No | gender_Female | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7590-VHVEG | 0 | 1 | 0 | 1 | 0 | -1 | 0 | 1 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| 1 | 5575-GNVDE | 0 | 0 | 0 | 34 | 1 | 0 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 2 | 3668-QPYBK | 0 | 0 | 0 | 2 | 1 | 0 | 1 | 1 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 3 | 7795-CFOCW | 0 | 0 | 0 | 45 | 0 | -1 | 1 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 4 | 9237-HQITU | 0 | 0 | 0 | 2 | 1 | 0 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 7038 | 6840-RESVB | 0 | 1 | 1 | 24 | 1 | 1 | 1 | 0 | 1 | ... | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 7039 | 2234-XADUH | 0 | 1 | 1 | 72 | 1 | 1 | 0 | 1 | 1 | ... | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 |
| 7040 | 4801-JZAZL | 0 | 1 | 1 | 11 | 0 | -1 | 1 | 0 | 0 | ... | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 |
| 7041 | 8361-LTMKD | 1 | 1 | 0 | 4 | 1 | 1 | 0 | 0 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 |
| 7042 | 3186-AJIEK | 0 | 0 | 0 | 66 | 1 | 0 | 1 | 0 | 1 | ... | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
7043 rows × 27 columns
new_df.shape
(7043, 27)
# From 21 columns it has increased to 27 since there are new columns added for Payment Method, Internet service & gender
# mentioned in One Hot encoding
new_df.corr()
| SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | ... | Churn | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | InternetService_DSL | InternetService_Fiber optic | InternetService_No | gender_Female | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SeniorCitizen | 1.000000 | 0.016479 | -0.211185 | 0.016567 | 0.008576 | 0.113791 | 0.081878 | 0.144828 | 0.140298 | 0.067457 | ... | 0.150889 | -0.016159 | -0.024135 | 0.171718 | -0.153477 | -0.108322 | 0.255338 | -0.182742 | 0.001874 | -0.001874 |
| Partner | 0.016479 | 1.000000 | 0.452676 | 0.379697 | 0.017706 | 0.117307 | 0.091303 | 0.090753 | 0.098664 | 0.076551 | ... | -0.150448 | 0.110706 | 0.082029 | -0.083852 | -0.095125 | -0.000851 | 0.000304 | 0.000615 | 0.001808 | -0.001808 |
| Dependents | -0.211185 | 0.452676 | 1.000000 | 0.159712 | -0.001762 | -0.019657 | -0.029721 | -0.062775 | -0.069078 | -0.040788 | ... | -0.164221 | 0.052021 | 0.060267 | -0.150642 | 0.059071 | 0.052010 | -0.165818 | 0.139812 | -0.010517 | 0.010517 |
| tenure | 0.016567 | 0.379697 | 0.159712 | 1.000000 | 0.008448 | 0.258958 | 0.232373 | 0.253743 | 0.253999 | 0.230522 | ... | -0.352229 | 0.243510 | 0.233006 | -0.208363 | -0.233852 | 0.013274 | 0.019720 | -0.039062 | -0.005106 | 0.005106 |
| PhoneService | 0.008576 | 0.017706 | -0.001762 | 0.008448 | 1.000000 | 0.675070 | -0.159989 | -0.129770 | -0.142012 | -0.161930 | ... | 0.011942 | 0.007556 | -0.007721 | 0.003062 | -0.003319 | -0.452425 | 0.289999 | 0.172209 | 0.006488 | -0.006488 |
| MultipleLines | 0.113791 | 0.117307 | -0.019657 | 0.258958 | 0.675070 | 1.000000 | 0.069085 | 0.130619 | 0.124502 | 0.069149 | ... | 0.036310 | 0.061513 | 0.042590 | 0.065663 | -0.176117 | -0.361806 | 0.414749 | -0.082560 | 0.009451 | -0.009451 |
| OnlineSecurity | 0.081878 | 0.091303 | -0.029721 | 0.232373 | -0.159989 | 0.069085 | 1.000000 | 0.705166 | 0.701690 | 0.735191 | ... | 0.023309 | 0.062184 | 0.073520 | 0.094304 | -0.239277 | 0.427891 | 0.252145 | -0.797084 | 0.014418 | -0.014418 |
| OnlineBackup | 0.144828 | 0.090753 | -0.062775 | 0.253743 | -0.129770 | 0.130619 | 0.705166 | 1.000000 | 0.711335 | 0.709216 | ... | 0.074205 | 0.057192 | 0.057873 | 0.158719 | -0.291444 | 0.314037 | 0.366555 | -0.803663 | 0.012230 | -0.012230 |
| DeviceProtection | 0.140298 | 0.098664 | -0.069078 | 0.253999 | -0.142012 | 0.124502 | 0.701690 | 0.711335 | 1.000000 | 0.725327 | ... | 0.084654 | 0.054689 | 0.071243 | 0.156932 | -0.300069 | 0.306717 | 0.373425 | -0.803500 | 0.004720 | -0.004720 |
| TechSupport | 0.067457 | 0.076551 | -0.040788 | 0.230522 | -0.161930 | 0.069149 | 0.735191 | 0.709216 | 0.725327 | 1.000000 | ... | 0.027037 | 0.066123 | 0.074562 | 0.092152 | -0.241759 | 0.422120 | 0.257847 | -0.797300 | 0.009409 | -0.009409 |
| StreamingTV | 0.167188 | 0.079785 | -0.086739 | 0.201053 | -0.108239 | 0.165254 | 0.662220 | 0.703167 | 0.748182 | 0.703866 | ... | 0.164673 | 0.030875 | 0.025425 | 0.248025 | -0.334131 | 0.217634 | 0.465190 | -0.811354 | 0.008673 | -0.008673 |
| StreamingMovies | 0.176469 | 0.075098 | -0.101475 | 0.205031 | -0.114715 | 0.162841 | 0.667033 | 0.700158 | 0.752952 | 0.704587 | ... | 0.163220 | 0.032405 | 0.030648 | 0.243398 | -0.335553 | 0.223262 | 0.460472 | -0.812158 | 0.010011 | -0.010011 |
| Contract | -0.142554 | 0.294806 | 0.243187 | 0.671607 | 0.002247 | 0.083343 | 0.015824 | -0.035407 | 0.005848 | 0.047218 | ... | -0.396713 | 0.186440 | 0.210659 | -0.342575 | -0.004882 | 0.055352 | -0.254157 | 0.242388 | -0.000126 | 0.000126 |
| PaperlessBilling | 0.156530 | -0.014877 | -0.111377 | 0.006152 | 0.016505 | 0.133255 | 0.184993 | 0.260715 | 0.246069 | 0.211031 | ... | 0.191825 | -0.016332 | -0.013589 | 0.208865 | -0.205398 | -0.063121 | 0.326853 | -0.321013 | 0.011754 | -0.011754 |
| MonthlyCharges | 0.220173 | 0.096848 | -0.113890 | 0.247900 | 0.247398 | 0.490700 | 0.635534 | 0.710477 | 0.737104 | 0.661032 | ... | 0.193356 | 0.042812 | 0.030550 | 0.271625 | -0.377437 | -0.160189 | 0.787066 | -0.763557 | 0.014569 | -0.014569 |
| TotalCharges | 0.102652 | 0.318364 | 0.063593 | 0.825464 | 0.113013 | 0.412166 | 0.482605 | 0.537169 | 0.545529 | 0.495075 | ... | -0.199037 | 0.186025 | 0.182745 | -0.059971 | -0.294814 | -0.052279 | 0.361045 | -0.374706 | 0.000002 | -0.000002 |
| Churn | 0.150889 | -0.150448 | -0.164221 | -0.352229 | 0.011942 | 0.036310 | 0.023309 | 0.074205 | 0.084654 | 0.027037 | ... | 1.000000 | -0.117937 | -0.134302 | 0.301919 | -0.091683 | -0.124214 | 0.308020 | -0.227890 | 0.008612 | -0.008612 |
| PaymentMethod_Bank transfer (automatic) | -0.016159 | 0.110706 | 0.052021 | 0.243510 | 0.007556 | 0.061513 | 0.062184 | 0.057192 | 0.054689 | 0.066123 | ... | -0.117937 | 1.000000 | -0.278215 | -0.376762 | -0.288685 | 0.025476 | -0.022624 | -0.002113 | 0.016024 | -0.016024 |
| PaymentMethod_Credit card (automatic) | -0.024135 | 0.082029 | 0.060267 | 0.233006 | -0.007721 | 0.042590 | 0.073520 | 0.057873 | 0.071243 | 0.074562 | ... | -0.134302 | -0.278215 | 1.000000 | -0.373322 | -0.286049 | 0.051438 | -0.050077 | 0.001030 | -0.001215 | 0.001215 |
| PaymentMethod_Electronic check | 0.171718 | -0.083852 | -0.150642 | -0.208363 | 0.003062 | 0.065663 | 0.094304 | 0.158719 | 0.156932 | 0.092152 | ... | 0.301919 | -0.376762 | -0.373322 | 1.000000 | -0.387372 | -0.104418 | 0.336410 | -0.284917 | -0.000752 | 0.000752 |
| PaymentMethod_Mailed check | -0.153477 | -0.095125 | 0.059071 | -0.233852 | -0.003319 | -0.176117 | -0.239277 | -0.291444 | -0.300069 | -0.241759 | ... | -0.091683 | -0.288685 | -0.286049 | -0.387372 | 1.000000 | 0.041899 | -0.306834 | 0.321361 | -0.013744 | 0.013744 |
| InternetService_DSL | -0.108322 | -0.000851 | 0.052010 | 0.013274 | -0.452425 | -0.361806 | 0.427891 | 0.314037 | 0.306717 | 0.422120 | ... | -0.124214 | 0.025476 | 0.051438 | -0.104418 | 0.041899 | 1.000000 | -0.640987 | -0.380635 | -0.006568 | 0.006568 |
| InternetService_Fiber optic | 0.255338 | 0.000304 | -0.165818 | 0.019720 | 0.289999 | 0.414749 | 0.252145 | 0.366555 | 0.373425 | 0.257847 | ... | 0.308020 | -0.022624 | -0.050077 | 0.336410 | -0.306834 | -0.640987 | 1.000000 | -0.465793 | 0.011286 | -0.011286 |
| InternetService_No | -0.182742 | 0.000615 | 0.139812 | -0.039062 | 0.172209 | -0.082560 | -0.797084 | -0.803663 | -0.803500 | -0.797300 | ... | -0.227890 | -0.002113 | 0.001030 | -0.284917 | 0.321361 | -0.380635 | -0.465793 | 1.000000 | -0.006026 | 0.006026 |
| gender_Female | 0.001874 | 0.001808 | -0.010517 | -0.005106 | 0.006488 | 0.009451 | 0.014418 | 0.012230 | 0.004720 | 0.009409 | ... | 0.008612 | 0.016024 | -0.001215 | -0.000752 | -0.013744 | -0.006568 | 0.011286 | -0.006026 | 1.000000 | -1.000000 |
| gender_Male | -0.001874 | -0.001808 | 0.010517 | 0.005106 | -0.006488 | -0.009451 | -0.014418 | -0.012230 | -0.004720 | -0.009409 | ... | -0.008612 | -0.016024 | 0.001215 | 0.000752 | 0.013744 | 0.006568 | -0.011286 | 0.006026 | -1.000000 | 1.000000 |
26 rows × 26 columns
sns.set(rc={'figure.figsize':(15.7,8)})
sns.set(style="ticks", color_codes=True)
sns.heatmap(new_df.corr(), annot=True, linewidths=0.5, center=0, cbar=False, cmap="YlGnBu")
<AxesSubplot:>
# 2F...... Splitting data into 80% Train and 20% Test
X = new_df.drop(["customerID","Churn"], axis=1) #........... Independent variables,
y = new_df['Churn'] #.............. Dependent variable
X.dtypes
SeniorCitizen int64 Partner int64 Dependents int64 tenure int64 PhoneService int64 MultipleLines int64 OnlineSecurity int64 OnlineBackup int64 DeviceProtection int64 TechSupport int64 StreamingTV int64 StreamingMovies int64 Contract int64 PaperlessBilling int64 MonthlyCharges float64 TotalCharges float64 PaymentMethod_Bank transfer (automatic) uint8 PaymentMethod_Credit card (automatic) uint8 PaymentMethod_Electronic check uint8 PaymentMethod_Mailed check uint8 InternetService_DSL uint8 InternetService_Fiber optic uint8 InternetService_No uint8 gender_Female uint8 gender_Male uint8 dtype: object
X.describe()
| SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | ... | TotalCharges | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | InternetService_DSL | InternetService_Fiber optic | InternetService_No | gender_Female | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | ... | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 | 7043.000000 |
| mean | 0.162147 | 0.483033 | 0.299588 | 32.371149 | 0.903166 | 0.325004 | 0.069999 | 0.128212 | 0.127219 | 0.073548 | ... | 2281.916928 | 0.219225 | 0.216101 | 0.335794 | 0.228880 | 0.343746 | 0.439585 | 0.216669 | 0.495244 | 0.504756 |
| std | 0.368612 | 0.499748 | 0.458110 | 24.559481 | 0.295752 | 0.642730 | 0.706051 | 0.738369 | 0.737868 | 0.708201 | ... | 2265.270398 | 0.413751 | 0.411613 | 0.472301 | 0.420141 | 0.474991 | 0.496372 | 0.412004 | 0.500013 | 0.500013 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | -1.000000 | ... | 18.800000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.000000 | 0.000000 | 0.000000 | 9.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 402.225000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 50% | 0.000000 | 0.000000 | 0.000000 | 29.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 1397.475000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 75% | 0.000000 | 1.000000 | 1.000000 | 55.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 3786.600000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 |
| max | 1.000000 | 1.000000 | 1.000000 | 72.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 8684.800000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 25 columns
from sklearn import preprocessing
X_standard = preprocessing.scale(X)
X_standard
array([[-0.43991649, 1.03453023, -0.65401193, ..., -0.52592737,
1.00955867, -1.00955867],
[-0.43991649, -0.96662231, -0.65401193, ..., -0.52592737,
-0.99053183, 0.99053183],
[-0.43991649, -0.96662231, -0.65401193, ..., -0.52592737,
-0.99053183, 0.99053183],
...,
[-0.43991649, 1.03453023, 1.5290241 , ..., -0.52592737,
1.00955867, -1.00955867],
[ 2.27315869, 1.03453023, -0.65401193, ..., -0.52592737,
-0.99053183, 0.99053183],
[-0.43991649, -0.96662231, -0.65401193, ..., -0.52592737,
-0.99053183, 0.99053183]])
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_standard, y, test_size=.20, random_state=1)
print("X_train shape",X_train.shape)
print("y_train shape",y_train.shape)
print("X_test shape",X_test.shape)
print("y_test shape",y_test.shape)
X_train shape (5634, 25) y_train shape (5634,) X_test shape (1409, 25) y_test shape (1409,)
X_train
array([[-0.43991649, 1.03453023, 1.5290241 , ..., 1.90140322,
-0.99053183, 0.99053183],
[-0.43991649, -0.96662231, -0.65401193, ..., -0.52592737,
1.00955867, -1.00955867],
[-0.43991649, 1.03453023, -0.65401193, ..., -0.52592737,
-0.99053183, 0.99053183],
...,
[-0.43991649, -0.96662231, -0.65401193, ..., -0.52592737,
-0.99053183, 0.99053183],
[-0.43991649, -0.96662231, -0.65401193, ..., -0.52592737,
-0.99053183, 0.99053183],
[-0.43991649, 1.03453023, 1.5290241 , ..., -0.52592737,
1.00955867, -1.00955867]])
#....Tree-based models care more about the order of values than about the absolute value that a feature takes.
#....Hence, normalization are skipped for decision trees and used mainly in linear models/KNN/neural networks because
#....they’re affected by absolute values taken by features.
#.......Decision trees are also not sensitive to outliers since the partitioning happens based on the proportion of samples
#.......within the split ranges and not on absolute values.
#.......Hence removing the scaling factor and going back to regulary X
X = new_df.drop(["customerID","Churn"], axis=1) #........... Independent variables,
y = new_df['Churn'] #.............. Dependent variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=1)
X_train
| SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | ... | TotalCharges | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | InternetService_DSL | InternetService_Fiber optic | InternetService_No | gender_Female | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1814 | 0 | 1 | 1 | 12 | 1 | 0 | -1 | -1 | -1 | -1 | ... | 258.35 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
| 5946 | 0 | 0 | 0 | 42 | 1 | 0 | 1 | 1 | 1 | 1 | ... | 3160.55 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 3881 | 0 | 1 | 0 | 71 | 1 | 1 | 1 | 1 | 0 | 1 | ... | 4681.75 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 2389 | 0 | 1 | 1 | 71 | 1 | 1 | 1 | 0 | 1 | 1 | ... | 6300.85 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
| 3676 | 0 | 0 | 0 | 30 | 1 | 0 | 1 | 1 | 0 | 1 | ... | 2044.75 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 905 | 1 | 0 | 0 | 9 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 918.60 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 5192 | 0 | 0 | 1 | 60 | 1 | 0 | -1 | -1 | -1 | -1 | ... | 1189.90 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
| 3980 | 0 | 0 | 0 | 28 | 1 | 1 | 0 | 1 | 1 | 0 | ... | 2979.50 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 235 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 1 | 0 | 0 | ... | 114.10 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 5157 | 0 | 1 | 1 | 16 | 1 | 0 | 1 | 1 | 0 | 1 | ... | 1114.85 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
5634 rows × 25 columns
y_train
1814 0
5946 1
3881 0
2389 0
3676 0
..
905 1
5192 0
3980 1
235 1
5157 0
Name: Churn, Length: 5634, dtype: int64
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(criterion = 'gini', random_state=1)
dt_model.fit(X_train, y_train)
DecisionTreeClassifier(random_state=1)
print(dt_model.score(X_train, y_train))
print(dt_model.score(X_test, y_test))
0.9980475683351083 0.7388218594748048
y_pred_0 = dt_model.predict(X_test)
from sklearn import metrics
from sklearn.metrics import roc_auc_score
cm=metrics.confusion_matrix(y_test, y_pred_0, labels=[1, 0])
df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (9,7))
sns.heatmap(df_cm, annot=True)
<AxesSubplot:>
print("Decision Tree - Accuracy ",metrics.accuracy_score(y_test, y_pred_0))
print("Decision Tree - Precision",metrics.precision_score(y_test, y_pred_0))
print("Decision Tree - Recall ",metrics.recall_score(y_test, y_pred_0))
print("Decision Tree - F1 score ",metrics.f1_score(y_test, y_pred_0))
print("Decision Tree - ROC ",metrics.roc_auc_score(y_test, y_pred_0))
Decision Tree - Accuracy 0.7388218594748048 Decision Tree - Precision 0.47512437810945274 Decision Tree - Recall 0.5488505747126436 Decision Tree - F1 score 0.5093333333333334 Decision Tree - ROC 0.6749907915976036
X_train
| SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | ... | TotalCharges | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | InternetService_DSL | InternetService_Fiber optic | InternetService_No | gender_Female | gender_Male | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1814 | 0 | 1 | 1 | 12 | 1 | 0 | -1 | -1 | -1 | -1 | ... | 258.35 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
| 5946 | 0 | 0 | 0 | 42 | 1 | 0 | 1 | 1 | 1 | 1 | ... | 3160.55 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 3881 | 0 | 1 | 0 | 71 | 1 | 1 | 1 | 1 | 0 | 1 | ... | 4681.75 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 |
| 2389 | 0 | 1 | 1 | 71 | 1 | 1 | 1 | 0 | 1 | 1 | ... | 6300.85 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
| 3676 | 0 | 0 | 0 | 30 | 1 | 0 | 1 | 1 | 0 | 1 | ... | 2044.75 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 905 | 1 | 0 | 0 | 9 | 1 | 1 | 0 | 0 | 1 | 0 | ... | 918.60 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 5192 | 0 | 0 | 1 | 60 | 1 | 0 | -1 | -1 | -1 | -1 | ... | 1189.90 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 |
| 3980 | 0 | 0 | 0 | 28 | 1 | 1 | 0 | 1 | 1 | 0 | ... | 2979.50 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 |
| 235 | 0 | 0 | 0 | 2 | 1 | 1 | 0 | 1 | 0 | 0 | ... | 114.10 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 |
| 5157 | 0 | 1 | 1 | 16 | 1 | 0 | 1 | 1 | 0 | 1 | ... | 1114.85 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
5634 rows × 25 columns
from sklearn.tree import export_graphviz
train_char_label = ['No', 'Yes']
Credit_Tree_File = open('credit_tree.dot','w')
dot_data = export_graphviz(dt_model, out_file=Credit_Tree_File, feature_names = list(X_train), class_names = list(train_char_label))
Credit_Tree_File.close()
from os import system
from IPython.display import Image
#Works only if "dot" command works on you machine
retCode = system("dot -Tpng credit_tree.dot -o credit_tree.png")
if(retCode>0):
print("system command returning error: "+str(retCode))
else:
display(Image("credit_tree.png"))
!pip install xgboost
Requirement already satisfied: xgboost in c:\users\hp\anaconda3\lib\site-packages (1.5.2) Requirement already satisfied: scipy in c:\users\hp\anaconda3\lib\site-packages (from xgboost) (1.6.2) Requirement already satisfied: numpy in c:\users\hp\anaconda3\lib\site-packages (from xgboost) (1.20.1)
# 3A.......... Train model using XGBoost
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
#from sklearn.grid_search import GridSearchCV #Perforing grid search
xgb_mod = XGBClassifier(use_label_encoder=False)
xgb_mod.fit(X_train,y_train)
[10:44:44] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
gamma=0, gpu_id=-1, importance_type=None,
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, predictor='auto', random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
tree_method='exact', use_label_encoder=False,
validate_parameters=1, verbosity=None)
print(xgb_mod.score(X_train,y_train))
print(xgb_mod.score(X_test,y_test))
0.9366347177848775 0.8019872249822569
y_pred_xgb = xgb_mod.predict(X_test)
from sklearn import metrics
from sklearn.metrics import roc_auc_score
cm=metrics.confusion_matrix(y_test, y_pred_xgb, labels=[1, 0])
df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (9,7))
sns.heatmap(df_cm, annot=True)
print("with XGBoost - Accuracy ",metrics.accuracy_score(y_test, y_pred_xgb))
print("with XGBoost - Precision",metrics.precision_score(y_test, y_pred_xgb))
print("with XGBoost - Recall ",metrics.recall_score(y_test, y_pred_xgb))
print("with XGBoost - F1 score ",metrics.f1_score(y_test, y_pred_xgb))
print("with XGBoost - ROC ",metrics.roc_auc_score(y_test, y_pred_xgb))
perf_cmp_Df = pd.DataFrame()
temp_perf_Df = pd.DataFrame({'XGBoost Classifier':["Base model"],'Accuracy': [metrics.accuracy_score(y_test, y_pred_xgb)],'Precision': [metrics.precision_score(y_test, y_pred_xgb)],'Recall': [metrics.recall_score(y_test, y_pred_xgb)],'F1 score': [metrics.f1_score(y_test, y_pred_xgb)],'ROC score': [metrics.roc_auc_score(y_test, y_pred_xgb)]})
perf_cmp_Df = pd.concat([perf_cmp_Df, temp_perf_Df])
perf_cmp_Df = perf_cmp_Df[['XGBoost Classifier','Accuracy','Precision','Recall','F1 score','ROC score']]
perf_cmp_Df
with XGBoost - Accuracy 0.8019872249822569 with XGBoost - Precision 0.6029850746268657 with XGBoost - Recall 0.5804597701149425 with XGBoost - F1 score 0.5915080527086384 with XGBoost - ROC 0.7275531649820707
| XGBoost Classifier | Accuracy | Precision | Recall | F1 score | ROC score | |
|---|---|---|---|---|---|---|
| 0 | Base model | 0.801987 | 0.602985 | 0.58046 | 0.591508 | 0.727553 |
# 3B....... Improve performance of XGBoost
xgb_mod = XGBClassifier(
learning_rate =0.1,
n_estimators=200,
max_depth=5,
min_child_weight=1,
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
objective='binary:logistic',
nthread=4,
use_label_encoder=False,
scale_pos_weight=1,seed=27)
xgb_mod.fit(X_train, y_train)
print(xgb_mod.score(X_train,y_train))
print(xgb_mod.score(X_test,y_test))
y_pred_xgb = xgb_mod.predict(X_test)
cm=metrics.confusion_matrix(y_test, y_pred_xgb, labels=[1, 0])
df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (9,7))
sns.heatmap(df_cm, annot=True)
print("Tuning 1 XGBoost - Accuracy ",metrics.accuracy_score(y_test, y_pred_xgb))
print("Tuning 1 XGBoost - Precision",metrics.precision_score(y_test, y_pred_xgb))
print("Tuning 1 XGBoost - Recall ",metrics.recall_score(y_test, y_pred_xgb))
print("Tuning 1 XGBoost - F1 score ",metrics.f1_score(y_test, y_pred_xgb))
print("Tuning 1 XGBoost - ROC ",metrics.roc_auc_score(y_test, y_pred_xgb))
temp_perf_Df = pd.DataFrame({'XGBoost Classifier':["l_rate =0.1,esti=200,depth=5,ch_weight=1,gamma=0,s_bytree=0.8"],
'Accuracy': [metrics.accuracy_score(y_test, y_pred_xgb)],
'Precision': [metrics.precision_score(y_test, y_pred_xgb)],
'Recall': [metrics.recall_score(y_test, y_pred_xgb)],
'F1 score': [metrics.f1_score(y_test, y_pred_xgb)],
'ROC score': [metrics.roc_auc_score(y_test, y_pred_xgb)]})
perf_cmp_Df = pd.concat([perf_cmp_Df, temp_perf_Df])
perf_cmp_Df = perf_cmp_Df[['XGBoost Classifier','Accuracy','Precision','Recall','F1 score','ROC score']]
perf_cmp_Df
[10:45:23] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior. 0.88604898828541 0.8076650106458482 Tuning 1 XGBoost - Accuracy 0.8076650106458482 Tuning 1 XGBoost - Precision 0.6199376947040498 Tuning 1 XGBoost - Recall 0.5718390804597702 Tuning 1 XGBoost - F1 score 0.5949177877428998 Tuning 1 XGBoost - ROC 0.7284266090329011
| XGBoost Classifier | Accuracy | Precision | Recall | F1 score | ROC score | |
|---|---|---|---|---|---|---|
| 0 | Base model | 0.801987 | 0.602985 | 0.580460 | 0.591508 | 0.727553 |
| 0 | l_rate =0.1,esti=200,depth=5,ch_weight=1,gamma... | 0.807665 | 0.619938 | 0.571839 | 0.594918 | 0.728427 |
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
xgb_model = XGBClassifier(use_label_encoder=False,eval_metric="logloss")
#List Hyperparameters to tune
eta = [0.01, 0.05, 0.10, 0.15]
gamma = [0, 0.1, 0.2, 0.3]
max_depth = [3, 4, 5]
min_ch_wgt= [1,3,5]
col_sam_tree= [0.3,0.4,0.5]
#convert to dictionary
hyperparameters = dict(learning_rate=eta, gamma=gamma, max_depth=max_depth, min_child_weight = min_ch_wgt, colsample_bytree=col_sam_tree)
#Making model
clf = GridSearchCV(xgb_model, hyperparameters, cv=5)
best_model = clf.fit(X_train,y_train)
#Best Hyperparameters Value
print("")
print('Best learning rate:', best_model.best_estimator_.get_params()['learning_rate'])
print('Best gamma:', best_model.best_estimator_.get_params()['gamma'])
print('Best max_depth:', best_model.best_estimator_.get_params()['max_depth'])
print('Best min_child_weight:', best_model.best_estimator_.get_params()['min_child_weight'])
print('Best colsample_bytree:', best_model.best_estimator_.get_params()['colsample_bytree'])
print("")
#Predict testing set
y_pred = best_model.predict(X_test)
print("GridSearchCV - Accuracy ",metrics.accuracy_score(y_test, y_pred))
print("GridSearchCV - Precision",metrics.precision_score(y_test, y_pred, average='weighted'))
print("GridSearchCV - Recall ",metrics.recall_score(y_test, y_pred, average='weighted'))
print("GridSearchCV - F1 score ",metrics.f1_score(y_test, y_pred, average='weighted'))
Best learning rate: 0.1 Best gamma: 0.1 Best max_depth: 3 Best min_child_weight: 5 Best colsample_bytree: 0.5 GridSearchCV - Accuracy 0.8133427963094393 GridSearchCV - Precision 0.8094690029446563 GridSearchCV - Recall 0.8133427963094393 GridSearchCV - F1 score 0.811153986406265
# 3B....... Best performing hyperparameters are
# Best learning rate: 0.1
# Best gamma: 0.1
# Best max_depth: 3
# Best min_child_weight: 5
# Best colsample_bytree: 0.5
# and the best test performance is
# GridSearchCV - Accuracy 0.8133427963094393
# GridSearchCV - Precision 0.8094690029446563
# GridSearchCV - Recall 0.8133427963094393
# GridSearchCV - F1 score 0.811153986406265
xgb = XGBClassifier(
learning_rate =0.1,
n_estimators=450,
max_depth=3,
min_child_weight=5,
gamma=0.1,
subsample=0.8,
colsample_bytree=0.5,
objective='binary:logistic',
nthread=4,
use_label_encoder=False,
eval_metric="logloss",
scale_pos_weight=1,seed=27)
xgb.fit(X_train, y_train)
print(xgb.score(X_train,y_train))
print(xgb.score(X_test,y_test))
y_pred_xgb = xgb.predict(X_test)
cm=metrics.confusion_matrix(y_test, y_pred_xgb, labels=[1, 0])
df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (9,7))
sns.heatmap(df_cm, annot=True)
print("Tuning 2 XGBoost - Accuracy ",metrics.accuracy_score(y_test, y_pred_xgb))
print("Tuning 2 XGBoost - Precision",metrics.precision_score(y_test, y_pred_xgb))
print("Tuning 2 XGBoost - Recall ",metrics.recall_score(y_test, y_pred_xgb))
print("Tuning 2 XGBoost - F1 score ",metrics.f1_score(y_test, y_pred_xgb))
print("Tuning 2 XGBoost - ROC ",metrics.roc_auc_score(y_test, y_pred_xgb))
temp_perf_Df = pd.DataFrame({'XGBoost Classifier':["l_rate =0.0.5,esti=250,depth=3,ch_weight=3,gamma=0.2,s_bytree=0.4"],
'Accuracy': [metrics.accuracy_score(y_test, y_pred_xgb)],
'Precision': [metrics.precision_score(y_test, y_pred_xgb)],
'Recall': [metrics.recall_score(y_test, y_pred_xgb)],
'F1 score': [metrics.f1_score(y_test, y_pred_xgb)],
'ROC score': [metrics.roc_auc_score(y_test, y_pred_xgb)]})
perf_cmp_Df = pd.concat([perf_cmp_Df, temp_perf_Df])
perf_cmp_Df = perf_cmp_Df[['XGBoost Classifier','Accuracy','Precision','Recall','F1 score','ROC score']]
perf_cmp_Df
0.8462903798367057 0.8041163946061036 Tuning 2 XGBoost - Accuracy 0.8041163946061036 Tuning 2 XGBoost - Precision 0.6104294478527608 Tuning 2 XGBoost - Recall 0.5718390804597702 Tuning 2 XGBoost - F1 score 0.5905044510385757 Tuning 2 XGBoost - ROC 0.7260703413608935
| XGBoost Classifier | Accuracy | Precision | Recall | F1 score | ROC score | |
|---|---|---|---|---|---|---|
| 0 | Base model | 0.801987 | 0.602985 | 0.580460 | 0.591508 | 0.727553 |
| 0 | l_rate =0.1,esti=200,depth=5,ch_weight=1,gamma... | 0.807665 | 0.619938 | 0.571839 | 0.594918 | 0.728427 |
| 0 | l_rate =0.0.5,esti=250,depth=3,ch_weight=3,gam... | 0.805536 | 0.612805 | 0.577586 | 0.594675 | 0.728944 |
| 0 | l_rate =0.0.5,esti=250,depth=3,ch_weight=3,gam... | 0.804116 | 0.610429 | 0.571839 | 0.590504 | 0.726070 |
from sklearn.metrics import mean_squared_error
xg_reg = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
max_depth = 5, alpha = 10, n_estimators = 100)
xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))
[14:43:29] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/objective/regression_obj.cu:188: reg:linear is now deprecated in favor of reg:squarederror. RMSE: 0.354577
data_dmatrix = xgb.DMatrix(data=X,label=y)
params = {"objective":"reg:linear",'colsample_bytree': 0.3,'learning_rate': 0.1,
'max_depth': 5, 'alpha': 10}
cv_results = xgb.cv(dtrain=data_dmatrix, params=params, nfold=3,
num_boost_round=50,early_stopping_rounds=10,metrics="rmse", as_pandas=True, seed=123)
[14:43:32] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/objective/regression_obj.cu:188: reg:linear is now deprecated in favor of reg:squarederror. [14:43:32] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/objective/regression_obj.cu:188: reg:linear is now deprecated in favor of reg:squarederror. [14:43:32] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/objective/regression_obj.cu:188: reg:linear is now deprecated in favor of reg:squarederror.
cv_results.head(10)
| train-rmse-mean | train-rmse-std | test-rmse-mean | test-rmse-std | |
|---|---|---|---|---|
| 0 | 0.479932 | 0.000444 | 0.480428 | 0.000266 |
| 1 | 0.464723 | 0.001017 | 0.465395 | 0.001602 |
| 2 | 0.452284 | 0.001441 | 0.453515 | 0.000340 |
| 3 | 0.439544 | 0.001557 | 0.441052 | 0.001102 |
| 4 | 0.429754 | 0.002944 | 0.431591 | 0.002260 |
| 5 | 0.421163 | 0.001664 | 0.423180 | 0.002409 |
| 6 | 0.412995 | 0.001716 | 0.415241 | 0.002067 |
| 7 | 0.405852 | 0.001942 | 0.408474 | 0.002092 |
| 8 | 0.400392 | 0.001260 | 0.403382 | 0.002611 |
| 9 | 0.395491 | 0.002103 | 0.398697 | 0.002633 |
print((cv_results["test-rmse-mean"]).tail(1))
49 0.368023 Name: test-rmse-mean, dtype: float64
xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)
[14:43:46] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/objective/regression_obj.cu:188: reg:linear is now deprecated in favor of reg:squarederror.
from sklearn.ensemble import GradientBoostingClassifier
gbcl = GradientBoostingClassifier(n_estimators = 100, learning_rate = 0.05, random_state=1)
gbcl = gbcl.fit(X_train, y_train)
print(gbcl.score(X_train,y_train))
print(gbcl.score(X_test,y_test))
y_pred_4 = gbcl.predict(X_test)
cm=metrics.confusion_matrix(y_test, y_pred_4, labels=[1, 0])
df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (9,7))
sns.heatmap(df_cm, annot=True)
print("Trees - 100, rate=0.05 XGB - Accuracy ",metrics.accuracy_score(y_test, y_pred_4))
print("Trees - 100, rate=0.05 XGB - Precision",metrics.precision_score(y_test, y_pred_4))
print("Trees - 100, rate=0.05 XGB - Recall ",metrics.recall_score(y_test, y_pred_4))
print("Trees - 100, rate=0.05 XGB - F1 score ",metrics.f1_score(y_test, y_pred_4))
print("Trees - 100, rate=0.05 XGB - ROC ",metrics.roc_auc_score(y_test, y_pred_4))
0.8143414980475683 0.8034066713981547 Trees - 100, rate=0.05 XGB - Accuracy 0.8034066713981547 Trees - 100, rate=0.05 XGB - Precision 0.6187290969899666 Trees - 100, rate=0.05 XGB - Recall 0.5316091954022989 Trees - 100, rate=0.05 XGB - F1 score 0.5718701700154559 Trees - 100, rate=0.05 XGB - ROC 0.7120816947793775
gbcl = GradientBoostingClassifier(n_estimators = 125, learning_rate = 0.08, random_state=1)
gbcl = gbcl.fit(X_train, y_train)
print(gbcl.score(X_train,y_train))
print(gbcl.score(X_test,y_test))
y_pred_5 = gbcl.predict(X_test)
cm=metrics.confusion_matrix(y_test, y_pred_5, labels=[1, 0])
df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (9,7))
sns.heatmap(df_cm, annot=True)
print("Trees - 125, rate=0.08 XGB - Accuracy ",metrics.accuracy_score(y_test, y_pred_5))
print("Trees - 125, rate=0.08 XGB - Precision",metrics.precision_score(y_test, y_pred_5))
print("Trees - 125, rate=0.08 XGB - Recall ",metrics.recall_score(y_test, y_pred_5))
print("Trees - 125, rate=0.08 XGB - F1 score ",metrics.f1_score(y_test, y_pred_5))
print("Trees - 125, rate=0.08 XGB - ROC ",metrics.roc_auc_score(y_test, y_pred_5))
0.8230386936457224 0.808374733853797 Trees - 125, rate=0.08 XGB - Accuracy 0.808374733853797 Trees - 125, rate=0.08 XGB - Precision 0.6242038216560509 Trees - 125, rate=0.08 XGB - Recall 0.5632183908045977 Trees - 125, rate=0.08 XGB - F1 score 0.5921450151057401 Trees - 125, rate=0.08 XGB - ROC 0.7260012783429208
# working only with the TelecomCustomers-Churn_2.csv as a part of modularisation.
# importing all the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
#.. importing the classifier libraries
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# importing the metrics libraries
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import mean_squared_error
# importing the tree libraries
from sklearn.tree import export_graphviz
from os import system
from IPython.display import Image
import pickle
global np,pd,plt,sns, xgb
global resultsDf
def csv(file): # called from data_under function
print("passed file",file)
global data
data = pd.read_csv(file) # loading the csv file into a dataframe
data.head(10)
return
def details(df): # called from data_under function
print("Datatypes :",df.dtypes) # printing the datatypes
print("")
print("Dataframe shape :",df.shape) # printing the shape ------ row, columns
print("")
print("Describe",df.describe()) # printing the 5 point summary
print("")
print("sample records",df.head(10))
def data_under(file):
csv(file) # passing a csv file as an argument to read into a dataframe
details(data) # printing the datatypes, shape and 5 point summary
def null_check(data): # to check if there are any missing values / nulls and fill them up with median values.
data.isnull().any().sum()
data.TotalCharges = pd.to_numeric(data.TotalCharges, errors="coerce")
data.isnull().values.any()
median = data['TotalCharges'].median() # this column has 11 missing values and is updated with median values
data['TotalCharges'].fillna(median, inplace=True) # dataframe filled with median values
return data
def show_data(data): # to check the distribution of categorical column data
for i in data.columns:
if ((i in ["customerID","MonthlyCharges","TotalCharges"])): # since they are not categorical columns
print()
else:
print("% S unique values", i)
print(data[i].value_counts())
print("")
def prep_exp(data):
null_check(data)
show_data(data)
def hist(data): # to print histograms for all the categorical columns
for i in data.columns:
if ((i in ["customerID","MonthlyCharges","TotalCharges"])): # not considering since they are not categorical
print()
else:
fig = plt.figure(figsize = (10,8))
ax = fig.gca()
plt.figure(i)
sns.histplot(data[i], kde=True, ax=ax)
def pie_new(data): # printing pie charts for all the categorical columns
for i in data.columns:
if (data[i].dtypes == "object"):
if i in ["customerID","MonthlyCharges","TotalCharges"]:
print()
else:
print("column is %S",i)
data.groupby(i).size().plot(kind='pie', subplots=True, shadow=True, startangle=30, figsize=(8,6), autopct='%1.2f%%')
plt.tight_layout()
plt.show()
def encode(data): # to encode the categorical variables
global new_data
replaceSt = {
"OnlineBackup": {"Yes": 1, "No": 0 ,"No internet service": -1},
"DeviceProtection": {"Yes": 1, "No": 0 ,"No internet service": -1},
"TechSupport": {"Yes": 1, "No": 0 ,"No internet service": -1},
"StreamingTV": {"Yes": 1, "No": 0 ,"No internet service": -1},
"StreamingMovies": {"Yes": 1, "No": 0 ,"No internet service": -1},
"Contract": {"Month-to-month": 1, "One year": 2 ,"Two year": 3},
"Churn": {"Yes": 1, "No": 0 },
"PaperlessBilling": {"Yes": 1, "No": 0 }
}
oneHot=["PaymentMethod"] # since ordering payment methods isnt possible, oneHot encoding it - creating multiple columns
new_data=data.replace(replaceSt)
new_data=pd.get_dummies(new_data, columns=oneHot) # new_data dataframe with encoded data
new_data.head(10)
return(new_data)
def cor(new_data): # function to create correlation among all the variables and printing heatmap
sns.set(rc={'figure.figsize':(15.7,8)})
sns.set(style="ticks", color_codes=True)
font1 = {'family':'serif','color':'blue','size':20}
plt.title("Corelation", fontdict = font1)
sns.heatmap(new_data.corr(), annot=True, linewidths=0.5, center=0, cbar=False, cmap="YlGnBu")
def analysis(data):
hist(data)
pie_new(data)
encode(data)
cor(new_data)
def split(new_data): # splitting the Independent and Dependent variables
global X,y,X_train1,y_train1,X_test1,y_test1
X = new_data.drop(["customerID","Churn"], axis=1) #........... Independent variables,
y = new_data['Churn'] #.............. Dependent variable
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=.20, random_state=2)
return
def export_tree(dt_model): # visualizing a tree picture for the decision tree
train_char_label = ['No', 'Yes']
Credit_Tree_File = open('IT_credit_tree.dot','w')
dot_data = export_graphviz(dt_model, out_file=Credit_Tree_File, feature_names = list(X_train1), class_names = list(train_char_label))
Credit_Tree_File.close()
retCode = system("dot -Tpng IT_credit_tree.dot -o IT_credit_tree.png")
if(retCode>0):
print("system command returning error: "+str(retCode))
else:
print("Displaying tree image")
display(Image("IT_credit_tree.png")) # displaying the tree image
def un_pikling(X_test1,y_test1): # unpickling the GradientBoosting model and predicting the dependent variable
# load model
with open('model.pkl', 'rb') as f:
loaded_classifier = pickle.load(f)
# predict
y_true = y_test1
y_pred = loaded_classifier.predict(X_test1)
print("")
print('pickle for Gradientclassifier: accuracy:', metrics.accuracy_score(y_true, y_pred))
def mod_fit(X_train1,y_train1,X_test1,y_test1): # Calling multiple classifier models to fit the training data
mod = [DecisionTreeClassifier(criterion = 'gini', random_state=2),
GradientBoostingClassifier(n_estimators = 75, learning_rate = 0.04, random_state=1),
XGBClassifier(learning_rate =0.1,n_estimators=200,max_depth=5,min_child_weight=1,gamma=0,subsample=0.8,colsample_bytree=0.8,objective='binary:logistic',nthread=4,scale_pos_weight=1,eed=27),
KNeighborsClassifier(n_neighbors= 5, weights = 'distance' ),
LogisticRegression(solver='liblinear'),
SVC(kernel='linear')]
global resultsDf
resultsDf = pd.DataFrame()
for i in mod:
model = i
new_i = str(i) # Classification models passed need to be converetd to string
sub_i = new_i[0:new_i.find("(")] # to pick the leftmost characters before the "(" paranthesis
if sub_i == "SVC": # standardization being carried out only for SVC model
X_stan = preprocessing.scale(X)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_stan, y, test_size=.20, random_state=2)
elif sub_i == "DecisionTreeClassifier": # Tree visualization created only for Decision tree classifier
model.fit(X_train1, y_train1)
export_tree(model)
elif sub_i == "GradientBoostingClassifier":
# Since this is the best performing model creating a pickle file for it
model.fit(X_train1, y_train1)
pickle.dump(model, open('model.pkl','wb')) # writing the model to a pickle file in binary mode.
model.fit(X_train1, y_train1)
print("")
y_pred = model.predict(X_test1)
cm=metrics.confusion_matrix(y_test1, y_pred, labels=[1, 0]) # Creatng a confusion matrix to check the metrics
df_cm = pd.DataFrame(cm, index = [i for i in ["1","0"]],
columns = [i for i in ["Predict 1","Predict 0"]])
plt.figure(figsize = (9,7))
font1 = {'family':'serif','color':'blue','size':20}
plt.title(sub_i, fontdict = font1)
sns.heatmap(df_cm, annot=True, linewidths=0.5, center=0, cbar=False, cmap="YlGnBu")
# collecting the metrics after each classifier prediction.
tempResultsDf = pd.DataFrame({'Method':[sub_i],'Accuracy': [metrics.accuracy_score(y_test1, y_pred)],'Precision': [metrics.precision_score(y_test1, y_pred)],'Recall': [metrics.recall_score(y_test1, y_pred)],'F1 score': [metrics.f1_score(y_test1, y_pred)],'ROC score': [metrics.roc_auc_score(y_test1, y_pred)]})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method','Accuracy','Precision','Recall','F1 score','ROC score']]
return(resultsDf)
def final_perf(resultDf): # Checking the best performing classifier and printing metric graphs
print("")
print("Max accuracy")
print(resultsDf['Accuracy'].max())
print( resultsDf[resultsDf.loc[:,"Accuracy"] == resultsDf["Accuracy"].max() ] )
resultsDf.plot.barh(x='Method', y='Accuracy', title='Accuracy comparison', color='orange')
ax = resultsDf.plot.bar(rot=0)
def model_building(data):
split(new_data)
mod_fit(X_train1,y_train1,X_test1,y_test1)
def main(file): # Main function which calls all the sub functions
data_under(file)
prep_exp(data)
analysis(data)
model_building(data)
details(resultsDf)
final_perf(resultsDf)
un_pikling(X_test1,y_test1)
return
if __name__ == "__main__":
sys.exit(main())
main("TelcomCustomer-Churn_2.csv")
passed file TelcomCustomer-Churn_2.csv Datatypes : customerID object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges object Churn object dtype: object Dataframe shape : (7043, 12) Describe MonthlyCharges count 7043.000000 mean 64.761692 std 30.090047 min 18.250000 25% 35.500000 50% 70.350000 75% 89.850000 max 118.750000 sample records customerID OnlineBackup DeviceProtection TechSupport StreamingTV \ 0 7590-VHVEG Yes No No No 1 5575-GNVDE No Yes No No 2 3668-QPYBK Yes No No No 3 7795-CFOCW No Yes Yes No 4 9237-HQITU No No No No 5 9305-CDSKC No Yes No Yes 6 1452-KIOVK Yes No No Yes 7 6713-OKOMC No No No No 8 7892-POOKP No Yes Yes Yes 9 6388-TABGU Yes No No No StreamingMovies Contract PaperlessBilling PaymentMethod \ 0 No Month-to-month Yes Electronic check 1 No One year No Mailed check 2 No Month-to-month Yes Mailed check 3 No One year No Bank transfer (automatic) 4 No Month-to-month Yes Electronic check 5 Yes Month-to-month Yes Electronic check 6 No Month-to-month Yes Credit card (automatic) 7 No Month-to-month No Mailed check 8 Yes Month-to-month Yes Electronic check 9 No One year No Bank transfer (automatic) MonthlyCharges TotalCharges Churn 0 29.85 29.85 No 1 56.95 1889.5 No 2 53.85 108.15 Yes 3 42.30 1840.75 No 4 70.70 151.65 Yes 5 99.65 820.5 Yes 6 89.10 1949.4 No 7 29.75 301.9 No 8 104.80 3046.05 Yes 9 56.15 3487.95 No % S unique values OnlineBackup No 3088 Yes 2429 No internet service 1526 Name: OnlineBackup, dtype: int64 % S unique values DeviceProtection No 3095 Yes 2422 No internet service 1526 Name: DeviceProtection, dtype: int64 % S unique values TechSupport No 3473 Yes 2044 No internet service 1526 Name: TechSupport, dtype: int64 % S unique values StreamingTV No 2810 Yes 2707 No internet service 1526 Name: StreamingTV, dtype: int64 % S unique values StreamingMovies No 2785 Yes 2732 No internet service 1526 Name: StreamingMovies, dtype: int64 % S unique values Contract Month-to-month 3875 Two year 1695 One year 1473 Name: Contract, dtype: int64 % S unique values PaperlessBilling Yes 4171 No 2872 Name: PaperlessBilling, dtype: int64 % S unique values PaymentMethod Electronic check 2365 Mailed check 1612 Bank transfer (automatic) 1544 Credit card (automatic) 1522 Name: PaymentMethod, dtype: int64 % S unique values Churn No 5174 Yes 1869 Name: Churn, dtype: int64 column is %S OnlineBackup
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
<Figure size 432x288 with 0 Axes>
column is %S DeviceProtection
column is %S TechSupport
column is %S StreamingTV
column is %S StreamingMovies
column is %S Contract
column is %S PaperlessBilling
column is %S PaymentMethod
column is %S Churn
Displaying tree image
[15:19:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:576:
Parameters: { "eed" } might not be used.
This could be a false alarm, with some parameters getting used by language bindings but
then being mistakenly passed down to XGBoost core, or some parameter actually being used
but getting flagged wrongly here. Please open an issue if you find any such cases.
[15:19:34] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.5.1/src/learner.cc:1115: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
C:\Users\HP\anaconda3\lib\site-packages\xgboost\sklearn.py:1224: UserWarning: The use of label encoder in XGBClassifier is deprecated and will be removed in a future release. To remove this warning, do the following: 1) Pass option use_label_encoder=False when constructing XGBClassifier object; and 2) Encode your labels (y) as integers starting with 0, i.e. 0, 1, 2, ..., [num_class - 1]. warnings.warn(label_encoder_deprecation_msg, UserWarning)
Datatypes : Method object
Accuracy float64
Precision float64
Recall float64
F1 score float64
ROC score float64
dtype: object
Dataframe shape : (6, 6)
Describe Accuracy Precision Recall F1 score ROC score
count 6.000000 6.000000 6.000000 6.000000 6.000000
mean 0.780932 0.582531 0.463123 0.512198 0.674147
std 0.030644 0.080497 0.032123 0.028070 0.016875
min 0.724627 0.449239 0.422414 0.475728 0.652048
25% 0.775195 0.557962 0.438937 0.488287 0.660039
50% 0.792051 0.600701 0.468391 0.526427 0.682185
75% 0.794535 0.607364 0.478448 0.532582 0.686323
max 0.811923 0.691244 0.508621 0.534400 0.688105
sample records Method Accuracy Precision Recall F1 score \
0 DecisionTreeClassifier 0.724627 0.449239 0.508621 0.477089
0 GradientBoostingClassifier 0.811923 0.691244 0.431034 0.530973
0 XGBClassifier 0.794890 0.608856 0.474138 0.533118
0 KNeighborsClassifier 0.770050 0.544444 0.422414 0.475728
0 LogisticRegression 0.790632 0.598513 0.462644 0.521880
0 SVC 0.793471 0.602888 0.479885 0.534400
ROC score
0 0.652048
0 0.683943
0 0.687116
0 0.653243
0 0.680426
0 0.688105
Max accuracy
0.8119233498935415
Method Accuracy Precision Recall F1 score \
0 GradientBoostingClassifier 0.811923 0.691244 0.431034 0.530973
ROC score
0 0.683943
pickle for Gradientclassifier: accuracy: 0.8119233498935415